1 Introduction

The transcription factors (TF) and DNA recognitions depend on multiples levels of interactions. The first level involves chromatin accessibility, where nuclesosome-depleted regions are highly associated with TFs binding compared to the closed chromatin, which is often inaccesible to most TFs. The second represents the existence of the consensus binding motif in the DNA sequence, a point that we are going to identify in this notebook in each cell type using Signac and ChromVar.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(reshape)
library(ggplot2)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
library(chromVAR)
library(motifmatchr)
library(ggpubr)
library(data.table)
library(chromVARmotifs)
library(dplyr)
library(purrr)
library(readxl)
library(writexl)
library(pheatmap)
library(factoextra)
library(corrplot)

set.seed(1234)

2.2 Parameters

cell_type = "CD4_T"

# Paths
path_to_obj <- paste0(
  here::here("scATAC-seq/results/R_objects/level_5/"),
  cell_type,
  "/04.",
  cell_type,
  "_integration_peak_calling_level_5.rds",
  sep = ""
)


color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", 
                    "#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
                    "#1C8356", "#FEAF16", "#822E1C", "#C4451C", 
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
                    "#FBE426", "#16FF32",  "black",  "#3283FE",
                    "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3", 
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", 
                    "#822E1C", "coral2",  "#1CFFCE", "#1CBE4F", 
                    "#3283FE", "#FBE426", "#F7E1A0", "#325A9B", 
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
                    "#FEAF16", "#683B79", "#B10DA1", "#1C7F93", 
                    "#F8A19F", "dark orange", "#FEAF16", 
                    "#FBE426", "Brown")

path_to_save <- paste0(
  here::here("scATAC-seq/results/R_objects/level_5/"),
  cell_type,
  "/05.",
  cell_type,
  "_chromVar_JASPAR_level_5.rds",
  sep = ""
)

path_to_save_TF_motifs <- paste0(
  here::here("scATAC-seq/results/files/"),
  cell_type,
  "/",
  cell_type,
  "_chromVar_JASPAR_level_5.xlsx",
  sep = ""
)

2.3 Functions

remove_correlated_helper <- function(mat, val, cutoff = 0.9) {
  stopifnot(nrow(mat) == length(val))
  cormat <- cor(t(mat), use = "pairwise.complete.obs")
  diag(cormat) <- NA
  keep <- seq_len(nrow(mat))
  for (i in order(val, decreasing = TRUE)) {
    if (i %in% keep) {
      toremove <- which(cormat[keep, i] >= cutoff)
      if (length(toremove) > 0) 
        keep <- keep[-toremove]
    }
  }
  return(keep)
}

3 Load data

seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat 
## 93116 features across 16383 samples within 1 assay 
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(
  seurat, 
  group.by = "annotation_paper",
  cols = color_palette,
  pt.size = 0.1
)

DimPlot(
  seurat, 
  split.by = "annotation_paper",
  cols = color_palette,
  pt.size = 0.1,
  ncol = 5
) + NoLegend()

4 Adding motif information to the Seurat object

4.1 Retrieving matrices from JASPAR2020 database

JASPAR is a collection of transcription factor DNA-binding preferences, modeled as matrices. To have a detaill explanation of it, visit the following link, http://jaspar.genereg.net/about/.

opts <- list()
opts[["tax_group"]] <- "vertebrates"
pfm <- getMatrixSet(JASPAR2020, opts)
length(pfm)
## [1] 746

4.2 Add motif information

seurat <- AddMotifs(
  object = seurat,
  genome = BSgenome.Hsapiens.UCSC.hg38,
  pfm = pfm
)

seurat[["peaks_level_5"]]
## ChromatinAssay data with 93116 features for 16383 cells
## Variable features: 92407 
## Genome: 
## Annotation present: TRUE 
## Motifs present: TRUE 
## Fragment files: 9

5 Computing motif activities

The TF motif enrichments (that help us to predict potential specific cell-type regulators) previously computed are not calculated per-cell and they do not take into account the insertion sequence bias of the Tn5 transposase. To account for these issues we can use chromVAR that computes for each motif annotation and each cell a bias corrected “desviation” in accessibility from a expected accessibility based on the average of all the cells. This allows us to visualize motif activities per cell, and also provides an alternative method of identifying differentially-active motifs between cell types.

# The RunChromVAR function retrieved the deviationScores, the Z-scores for each bias corrected deviations.
seurat <- RunChromVAR(
  object = seurat,
  genome = BSgenome.Hsapiens.UCSC.hg38
)

saveRDS(seurat, path_to_save)

5.1 Averaging the Z-Score by all the cell that conform a cell-type group.

avgexpr_mat <- AverageExpression(
  seurat,
  assays = "chromvar",
  return.seurat = F,
  group.by = "ident",
  slot = "data")
res.pca <- prcomp(t(avgexpr_mat$chromvar),scale. = T)

options(repr.plot.width=6, repr.plot.height=8)
fviz_pca_ind(res.pca,
             repel = TRUE)

pheatmap(avgexpr_mat$chromvar, scale = "row",
         border_color = "black",
         cluster_rows = T,
         cluster_cols = T,
         fontsize_row= 3,
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean", 
         clustering_method = "ward.D2",
         cutree_rows = NA, 
         cutree_cols = 2)

5.2 Averaging the Z-Score & Correction proposed by chromVar

We compute the standar desviation for each annotated motif. Then, we select motifs that have a standar desviation higher than a specific theshold and use them to perform a correlation test and a principal component analysis. Note, that we are going to use the function “remove_correlated_helper”, to eliminate the variables that present a correlation greater than 0.9.

seurat_average <- AverageExpression(
  seurat,
  assays = "chromvar",
  return.seurat = T,
  group.by = "ident")


threshold = 1
matrix <- seurat_average[["chromvar"]]@data
vars <- matrixStats::rowSds(matrix, na_rm = TRUE)
boxplot(vars)
abline(h=1)

ix <- which(vars >= threshold)
ix2 <- ix[remove_correlated_helper(matrix[ix, , drop = FALSE], 
                                   vars[ix])]

cormat <- cor(matrix[ix2,], 
              use = "pairwise.complete.obs")

corrplot(cormat, type="upper", order="hclust", tl.col="black", tl.srt=45)

pc_res <- prcomp(t(matrix[ix2, ]))
fviz_pca_ind(pc_res, repel = TRUE)

5.3 Performing differential motif activity analysis.

DefaultAssay(seurat) <- 'chromvar'
Idents(seurat) <- seurat$annotation_paper

da_regions <- FindAllMarkers(
 object = seurat,
 only.pos = TRUE,
 min.pct = 0.1,
 test.use = 'LR',
 latent.vars = 'nCount_peaks_level_5'
)

motif_name <- sapply(da_regions$gene, function(x) {name(getMatrixByID(JASPAR2020, ID = x))})
da_regions$motif_name <- motif_name

families <- sapply(da_regions$gene, function(x) {tags(getMatrixByID(JASPAR2020, ID = x))$family})
da_regions$family <- families
da_regions$family <- gsub('\\s+', '',da_regions$family)

DT::datatable(da_regions)
da_regions_selected <- (da_regions[da_regions$p_val_adj < 0.005 & 
                                     da_regions$avg_log2FC > 0.5, ])

We transform the motif ID in motif name and also we add the family associated with it.

output <- split(da_regions, da_regions$cluster)
names(output) <- c("Naive", "CM Pre-non-Tfh", "CM PreTfh", "T-Trans-Mem", 
                   "T-Eff-Mem", "T-helper", "Tfh T-B border", "Tfh-LZ-GC", 
                   "GC-Tfh-SAP", "GC-Tfh-0X40", "Tfh-Mem","Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs")

write_xlsx(output, path_to_save_TF_motifs)

5.3.1 Correlation between the number of potential motifs compared with the number of cells

correlation <- data.frame(table(da_regions_selected$cluster),
              table(seurat$annotation_paper))[,c(1,2,4)]

colnames(correlation) <- c("cluster","n_motifs","n_cells")

ggscatter(correlation, x = "n_cells",
          y = "n_motifs",
          add = "reg.line",                                 # Add regression line
          conf.int = TRUE,
          label = "cluster", 
          font.label = c(9, "bold", "black"),
          add.params = list(color = "blue",
                            fill = "lightgray")
          )+stat_cor(method = "pearson", 
                     label.x = 3, 
                     label.y = 500)  # Add correlation coefficient

5.3.2 Ranking motifs and save the entire output

da_regions_selected_sorted <- da_regions_selected[with(da_regions_selected,
                                                       order(cluster, -avg_log2FC)), ]

da_regions_selected_sorted$rank <- ave(da_regions_selected_sorted$avg_log2FC,
                                       da_regions_selected_sorted$cluster, 
                                       FUN = seq_along)

da_regions_selected_sorted_prepared <- da_regions_selected_sorted %>% 
  group_by(cluster) %>% top_n(20,-rank)
ggplot(da_regions_selected_sorted_prepared, 
       aes(x = rank, 
           y = avg_log2FC, 
           color = avg_log2FC)) + 
 geom_point(size = 1) +
  ggrepel::geom_label_repel(
        data = da_regions_selected_sorted_prepared, 
        aes(y = avg_log2FC, label = motif_name), 
        size = 4,
       nudge_x = 2,
        color = "black"
  ) + theme_minimal() + facet_wrap(~ cluster, ncol = 3)

We can plot the motifs group by family.

ggbarplot(da_regions_selected_sorted_prepared, "family", "avg_log2FC", 
  fill = "cluster",width = 0.5,
  size = 0.1) + theme(legend.position = "none",
  text = element_text(size=12),
  axis.text.x = element_text(angle=90, hjust=1),
  strip.text.x = element_text(size = 7, colour = "black")) + 
  facet_grid(~cluster, scales = "free_x")

name <- "GC-Tfh-SAP"
specific_cluster = da_regions_selected_sorted_prepared[da_regions_selected_sorted_prepared$cluster == name,]

ggbarplot(specific_cluster, "motif_name", "avg_log2FC", 
  fill = "family",width = 0.5,
  size = 0.1) + theme(legend.position = "none",
  text = element_text(size=10),
  axis.text.x = element_text(angle=90, hjust=1),
  strip.text.x = element_text(size = 7, colour = "black")) + 
  facet_grid(~cluster ~family, scales = "free_x")

5.3.3 Plotting UMAP TF motif activity

specific_cluster_sorted <- specific_cluster[order(specific_cluster$rank),]

FeaturePlot(
  object = seurat,
  features = specific_cluster_sorted$gene[1:4],
  min.cutoff = 'q5',
  max.cutoff = 'q95',
  pt.size = 0.1,
  ncol = 2
)

6 Session info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] corrplot_0.84                     factoextra_1.0.7.999              pheatmap_1.0.12                   writexl_1.4.0                     readxl_1.3.1                      purrr_0.3.4                       dplyr_1.0.7                       chromVARmotifs_0.2.0              data.table_1.14.0                 ggpubr_0.4.0                      motifmatchr_1.10.0                chromVAR_1.10.0                   patchwork_1.1.1                   BSgenome.Hsapiens.UCSC.hg38_1.4.3 BSgenome_1.56.0                   rtracklayer_1.48.0                Biostrings_2.56.0                 XVector_0.28.0                    GenomicRanges_1.40.0              GenomeInfoDb_1.24.2               IRanges_2.22.1                    S4Vectors_0.26.0                  BiocGenerics_0.34.0               TFBSTools_1.26.0                  JASPAR2020_0.99.10                ggplot2_3.3.5                     reshape_0.8.8                     Signac_1.2.1                      SeuratObject_4.0.2                Seurat_4.0.3                      BiocStyle_2.16.1                 
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1                  reticulate_1.20             R.utils_2.10.1              tidyselect_1.1.1            poweRlaw_0.70.6             RSQLite_2.2.1               AnnotationDbi_1.50.3        htmlwidgets_1.5.3           grid_4.0.3                  docopt_0.7.1                BiocParallel_1.22.0         Rtsne_0.15                  munsell_0.5.0               codetools_0.2-17            ica_1.0-2                   DT_0.16                     future_1.21.0               miniUI_0.1.1.1              withr_2.4.2                 colorspace_2.0-2            Biobase_2.48.0              knitr_1.30                  ROCR_1.0-11                 ggsignif_0.6.0              tensor_1.5                  listenv_0.8.0               labeling_0.4.2              slam_0.1-47                 GenomeInfoDbData_1.2.3      polyclip_1.10-0             bit64_4.0.5                 farver_2.1.0                rprojroot_2.0.2             parallelly_1.26.1           vctrs_0.3.8                 generics_0.1.0              xfun_0.18                   lsa_0.73.2                  ggseqlogo_0.1               R6_2.5.0                    bitops_1.0-7                spatstat.utils_2.2-0       
##  [43] DelayedArray_0.14.0         assertthat_0.2.1            promises_1.2.0.1            scales_1.1.1                gtable_0.3.0                globals_0.14.0              goftest_1.2-2               seqLogo_1.54.3              rlang_0.4.11                RcppRoll_0.3.0              splines_4.0.3               rstatix_0.6.0               lazyeval_0.2.2              broom_0.7.2                 spatstat.geom_2.2-0         BiocManager_1.30.10         yaml_2.2.1                  reshape2_1.4.4              abind_1.4-5                 crosstalk_1.1.1             backports_1.1.10            httpuv_1.6.1                tools_4.0.3                 bookdown_0.21               nabor_0.5.0                 ellipsis_0.3.2              spatstat.core_2.2-0         RColorBrewer_1.1-2          ggridges_0.5.3              Rcpp_1.0.6                  plyr_1.8.6                  zlibbioc_1.34.0             RCurl_1.98-1.2              rpart_4.1-15                deldir_0.2-10               pbapply_1.4-3               cowplot_1.1.1               zoo_1.8-9                   haven_2.3.1                 SummarizedExperiment_1.18.1 ggrepel_0.9.1               cluster_2.1.0              
##  [85] here_1.0.1                  magrittr_2.0.1              scattermore_0.7             openxlsx_4.2.4              lmtest_0.9-38               RANN_2.6.1                  SnowballC_0.7.0             fitdistrplus_1.1-5          matrixStats_0.59.0          hms_0.5.3                   mime_0.11                   evaluate_0.14               xtable_1.8-4                XML_3.99-0.3                rio_0.5.16                  sparsesvd_0.2               gridExtra_2.3               compiler_4.0.3              tibble_3.1.2                KernSmooth_2.23-17          crayon_1.4.1                R.oo_1.24.0                 htmltools_0.5.1.1           mgcv_1.8-33                 later_1.2.0                 tidyr_1.1.3                 DBI_1.1.0                   tweenr_1.0.1                MASS_7.3-53                 car_3.0-10                  Matrix_1.3-4                readr_1.4.0                 R.methodsS3_1.8.1           igraph_1.2.6                forcats_0.5.0               pkgconfig_2.0.3             GenomicAlignments_1.24.0    TFMPvalue_0.0.8             foreign_0.8-80              plotly_4.9.4.1              spatstat.sparse_2.0-0       annotate_1.66.0            
## [127] DirichletMultinomial_1.30.0 stringr_1.4.0               digest_0.6.27               sctransform_0.3.2           RcppAnnoy_0.0.18            pracma_2.2.9                CNEr_1.24.0                 spatstat.data_2.1-0         cellranger_1.1.0            rmarkdown_2.5               leiden_0.3.8                fastmatch_1.1-0             uwot_0.1.10                 curl_4.3.2                  shiny_1.6.0                 Rsamtools_2.4.0             gtools_3.9.2                lifecycle_1.0.0             nlme_3.1-150                jsonlite_1.7.2              carData_3.0-4               viridisLite_0.4.0           fansi_0.5.0                 pillar_1.6.1                lattice_0.20-41             KEGGREST_1.28.0             fastmap_1.1.0               httr_1.4.2                  survival_3.2-7              GO.db_3.11.4                glue_1.4.2                  zip_2.1.1                   qlcMatrix_0.9.7             png_0.1-7                   bit_4.0.4                   ggforce_0.3.2               stringi_1.6.2               blob_1.2.1                  caTools_1.18.2              memoise_1.1.0               irlba_2.3.3                 future.apply_1.7.0